Concrete Strength Prediction

In [97]:
import warnings
warnings.filterwarnings('ignore')
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use('ggplot')
import seaborn as sns
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

Cement (component 1) -- quantitative -- kg in a m3 mixture -- Input Variable

Blast Furnace Slag (component 2) -- quantitative -- kg in a m3 mixture -- Input Variable

Fly Ash (component 3) -- quantitative -- kg in a m3 mixture -- Input Variable

Water (component 4) -- quantitative -- kg in a m3 mixture -- Input Variable

Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture -- Input Variable

Coarse Aggregate (component 6) -- quantitative -- kg in a m3 mixture -- Input Variable

Fine Aggregate (component 7) -- quantitative -- kg in a m3 mixture -- Input Variable

Age -- quantitative -- Day (1~365) -- Input Variable

Concrete compressive strength -- quantitative -- MPa -- Output Variable
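
The CSV loaded in the next cell stores these components under short column names (cement, slag, ash, water, superplastic, coarseagg, fineagg, age, strength). A minimal sanity-check sketch, assuming that file name and those column names:

# Sketch (hypothetical check, not part of the original workflow):
# confirm the expected schema and the documented age range before analysis
expected_cols = ['cement', 'slag', 'ash', 'water', 'superplastic',
                 'coarseagg', 'fineagg', 'age', 'strength']
df_check = pd.read_csv("concrete.csv")
assert list(df_check.columns) == expected_cols
assert df_check['age'].between(1, 365).all()   # age is documented as 1~365 days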

In [98]:
# importing data
df = pd.read_csv("concrete.csv")
df.head()      # used to see top 5 rows of the dataset
Out[98]:
cement slag ash water superplastic coarseagg fineagg age strength
0 141.3 212.0 0.0 203.5 0.0 971.8 748.5 28 29.89
1 168.9 42.2 124.3 158.3 10.8 1080.8 796.2 14 23.51
2 250.0 0.0 95.7 187.4 5.5 956.9 861.2 28 29.22
3 266.0 114.0 0.0 228.0 0.0 932.0 670.0 28 45.85
4 154.8 183.4 0.0 193.3 9.1 1047.4 696.7 28 18.29

1. Univariate analysis (10 )

In [99]:
#five point analysis
df.describe().transpose()
Out[99]:
count mean std min 25% 50% 75% max
cement 1030.0 281.167864 104.506364 102.00 192.375 272.900 350.000 540.0
slag 1030.0 73.895825 86.279342 0.00 0.000 22.000 142.950 359.4
ash 1030.0 54.188350 63.997004 0.00 0.000 0.000 118.300 200.1
water 1030.0 181.567282 21.354219 121.80 164.900 185.000 192.000 247.0
superplastic 1030.0 6.204660 5.973841 0.00 0.000 6.400 10.200 32.2
coarseagg 1030.0 972.918932 77.753954 801.00 932.000 968.000 1029.400 1145.0
fineagg 1030.0 773.580485 80.175980 594.00 730.950 779.500 824.000 992.6
age 1030.0 45.662136 63.169912 1.00 7.000 28.000 56.000 365.0
strength 1030.0 35.817961 16.705742 2.33 23.710 34.445 46.135 82.6
In [100]:
#checking for datatypes
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cement        1030 non-null   float64
 1   slag          1030 non-null   float64
 2   ash           1030 non-null   float64
 3   water         1030 non-null   float64
 4   superplastic  1030 non-null   float64
 5   coarseagg     1030 non-null   float64
 6   fineagg       1030 non-null   float64
 7   age           1030 non-null   int64  
 8   strength      1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB
In [101]:
#no of rows and columns
df.shape
Out[101]:
(1030, 9)
In [102]:
#checking for null values
df.isnull().values.any()
Out[102]:
False
In [103]:
#checking for any skewed values
df.skew()
Out[103]:
cement          0.509481
slag            0.800717
ash             0.537354
water           0.074628
superplastic    0.907203
coarseagg      -0.040220
fineagg        -0.253010
age             3.269177
strength        0.416977
dtype: float64
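
age stands out with a skewness of 3.27, while the other columns are close to symmetric. The notebook keeps age untransformed; if a correction were wanted, a log1p transform is a common remedy. A minimal sketch, not applied below:

# Sketch: reduce the right skew of 'age' with log1p and compare skewness
age_log = np.log1p(df['age'])
print(df['age'].skew(), age_log.skew())   # before vs. after the transform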
In [104]:
#visualizing age skewness using a box plot
sns.boxplot(df['age'])
Out[104]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c3a6ac8>
In [105]:
sns.distplot(df['age'])
# Age values up to 365 are valid, since age is documented to range from 1 to 365 days
Out[105]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c3d2b88>
In [106]:
sns.distplot(df['fineagg'])
Out[106]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c4e6688>
In [107]:
sns.distplot(df['coarseagg'])
Out[107]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c587988>
In [108]:
sns.distplot(df['superplastic'])
Out[108]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c610088>
In [109]:
sns.distplot(df['water'])
Out[109]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c6c6688>
In [110]:
sns.distplot(df['ash'])
Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c772d48>
In [111]:
sns.distplot(df['slag'])
Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c7b9448>
In [112]:
sns.distplot(df['cement'])
Out[112]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c88d888>
In [113]:
sns.distplot(df['strength'], kde= True)
Out[113]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c8fda88>
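
The nine per-column plots above can also be produced in a single loop; a compact sketch using the same distplot call as the cells above:

# Sketch: all nine distributions in one 3x3 grid instead of separate cells
fig, axes = plt.subplots(3, 3, figsize=(15, 10))
for ax, col in zip(axes.ravel(), df.columns):
    sns.distplot(df[col], ax=ax)
plt.tight_layout()
plt.show()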

2. Bivariate Analysis (10 )

In [114]:
sns.pairplot(df)
#From the pairplot, strength increases as the cement content increases
Out[114]:
<seaborn.axisgrid.PairGrid at 0x2185c8f8288>
In [115]:
# visualise the cement-strength relationship
sns.regplot(x="cement", y="strength", data=df, fit_reg=False)
Out[115]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185f60b2c8>
In [116]:
sns.regplot(x="slag", y="strength", data=df, fit_reg=False)
Out[116]:
<matplotlib.axes._subplots.AxesSubplot at 0x21861191888>
In [117]:
sns.regplot(x="ash", y="strength", data=df, fit_reg=False)
Out[117]:
<matplotlib.axes._subplots.AxesSubplot at 0x218611cd888>
In [118]:
sns.regplot(x="water", y="strength", data=df, fit_reg=False)
Out[118]:
<matplotlib.axes._subplots.AxesSubplot at 0x21861cd7f08>
In [119]:
sns.regplot(x="superplastic", y="strength", data=df, fit_reg=False)
Out[119]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c3dc288>
In [120]:
sns.regplot(x="coarseagg", y="strength", data=df, fit_reg=False)
Out[120]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c36b388>
In [121]:
sns.regplot(x="fineagg", y="strength", data=df, fit_reg=False)
Out[121]:
<matplotlib.axes._subplots.AxesSubplot at 0x21861dd8e08>
In [122]:
sns.regplot(x="age", y="strength", data=df, fit_reg=False)
Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x21861e49a88>
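
To back the scatter plots with numbers, the Pearson correlation of each input with strength can be ranked; a short sketch:

# Sketch: rank the inputs by their linear correlation with compressive strength
corr_with_target = df.corr()['strength'].drop('strength').sort_values(ascending=False)
print(corr_with_target)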

3. Feature engineering techniques (10 )

In [123]:
#Checking for highly correlated variables
df.corr()
#No pair of independent variables is strongly correlated (the largest magnitude is water vs. superplastic at -0.66), so all variables are kept as they are.
Out[123]:
cement slag ash water superplastic coarseagg fineagg age strength
cement 1.000000 -0.275216 -0.397467 -0.081587 0.092386 -0.109349 -0.222718 0.081946 0.497832
slag -0.275216 1.000000 -0.323580 0.107252 0.043270 -0.283999 -0.281603 -0.044246 0.134829
ash -0.397467 -0.323580 1.000000 -0.256984 0.377503 -0.009961 0.079108 -0.154371 -0.105755
water -0.081587 0.107252 -0.256984 1.000000 -0.657533 -0.182294 -0.450661 0.277618 -0.289633
superplastic 0.092386 0.043270 0.377503 -0.657533 1.000000 -0.265999 0.222691 -0.192700 0.366079
coarseagg -0.109349 -0.283999 -0.009961 -0.182294 -0.265999 1.000000 -0.178481 -0.003016 -0.164935
fineagg -0.222718 -0.281603 0.079108 -0.450661 0.222691 -0.178481 1.000000 -0.156095 -0.167241
age 0.081946 -0.044246 -0.154371 0.277618 -0.192700 -0.003016 -0.156095 1.000000 0.328873
strength 0.497832 0.134829 -0.105755 -0.289633 0.366079 -0.164935 -0.167241 0.328873 1.000000
In [124]:
plt.figure(figsize=(10,8))

sns.heatmap(df.corr(),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False)

plt.show()
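
Besides the pairwise correlation matrix, a variance inflation factor (VIF) check is a common way to screen for multicollinearity. A sketch using statsmodels, which is an extra dependency not imported above:

# Sketch: VIF per predictor; values well above ~5-10 would flag multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

X_vif = sm.add_constant(df.drop('strength', axis=1))
for i, col in enumerate(X_vif.columns):
    if col != 'const':
        print(col, round(variance_inflation_factor(X_vif.values, i), 2))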
In [125]:
# Copy all the predictor variables into X dataframe. Since 'strength' is dependent variable drop it
X = df.drop('strength', axis=1)

# Copy the 'strength' column alone into the y dataframe. This is the dependent variable
y = df[['strength']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
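
The tree-based models used below are insensitive to feature scale, but the Ridge and Lasso penalties are not. If the penalised linear models were the focus, standardising the predictors first would be reasonable; a sketch, not applied in this notebook:

# Sketch: standardise predictors before penalised linear models (Ridge/Lasso)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # fit the scaler on the training split only
X_test_scaled = scaler.transform(X_test)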
In [126]:
#Using PolynomialFeatures to create additional interaction features as independent variables
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 2, interaction_only=True)
X_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.30, random_state=1)
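
With degree=2 and interaction_only=True, the 8 inputs expand to 37 columns (1 bias term, 8 linear terms and 28 pairwise interactions), which matches the length of the coefficient vectors printed below. To see which column is which, the transformer can report the generated names; a sketch (the method name depends on the scikit-learn version):

# Sketch: inspect the 37 generated feature names
try:
    poly_names = poly.get_feature_names_out(X.columns)   # scikit-learn >= 1.0
except AttributeError:
    poly_names = poly.get_feature_names(X.columns)       # older scikit-learn
print(len(poly_names), list(poly_names)[:10])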

4. Creating the model and tuning it (30 )

In [127]:
#Linear Regression
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
regression_model.score(X_test, y_test)
Out[127]:
0.7444710081439875
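
R² alone hides the size of the errors; RMSE and MAE in MPa are easier to judge against strengths that average about 36 MPa. A short sketch for the linear model:

# Sketch: report errors in MPa alongside the R^2 score above
from sklearn.metrics import mean_squared_error, mean_absolute_error
y_pred = regression_model.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAE :", mean_absolute_error(y_test, y_pred))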
In [128]:
#Ridge
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))
Ridge model: [[ 0.00000000e+00  3.34737287e-01 -8.90485076e-02 -6.30434183e-02
   1.32035147e+00 -5.86750605e+00 -1.02711861e-01 -2.79869108e-01
  -1.60696328e-01  8.73200298e-05  1.77720733e-04 -1.76103896e-03
  -3.06993810e-03  4.03666779e-05  7.44596754e-05  4.39670032e-04
   3.25685432e-04 -8.96567438e-04  6.94464692e-04  5.50193577e-05
   3.02286525e-04  7.80669809e-04 -1.70452413e-03 -6.47920159e-03
   5.66003664e-05  3.90510372e-04  1.67035513e-03  1.45558038e-02
  -6.68676049e-04 -1.90665156e-04 -7.25230464e-04  3.55321606e-03
   1.74081215e-03  6.37347701e-03  2.61315740e-04 -6.42094713e-05
   3.62849389e-04]]
In [129]:
#Lasso: checking whether any coefficients are shrunk to zero, which would suggest variables that could be dropped.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))
Lasso model: [ 0.00000000e+00  2.05754116e-01 -3.87982396e-02 -4.35284753e-01
  1.23746835e+00  1.33809450e-01 -5.29179020e-02 -1.51825356e-01
 -2.78460812e-01  8.95660513e-05  2.60779047e-04 -1.33378929e-03
 -3.95450528e-03  7.18779238e-05  1.00597102e-04  3.72222764e-04
  3.88802403e-04 -8.09034433e-04 -1.02471621e-03  2.93639011e-05
  2.50952391e-04  7.37821730e-04 -9.66548234e-04 -7.33838283e-03
  2.20040160e-04  4.67341433e-04  1.55572176e-03  9.50087061e-03
 -6.25908111e-04 -3.76717225e-04 -3.77843298e-04  1.15780298e-03
 -1.45978046e-03  8.20078420e-03  1.70095680e-04  3.36183515e-05
  3.31263963e-04]
In [130]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))
0.7549507401174544
0.744826902205591
In [131]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))
#The test scores of the linear, Ridge and Lasso models are comparable, all around 74-75%. We will try tree-based algorithms next.
0.7525566410009125
0.7479043301769873
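
The alpha values above (0.3 for Ridge, 0.1 for Lasso) were fixed by hand; a small grid search over alpha is a cheap way to check whether they are close to optimal. A sketch for Ridge (the alpha grid is illustrative):

# Sketch: search the regularisation strength instead of fixing it manually
alpha_grid = {'alpha': [0.01, 0.1, 0.3, 1, 3, 10]}
ridge_cv = GridSearchCV(Ridge(), param_grid=alpha_grid, scoring='r2', cv=5)
ridge_cv.fit(X_train, y_train)
print(ridge_cv.best_params_, ridge_cv.best_score_)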
In [132]:
dtRegressor = DecisionTreeRegressor(random_state=0)
dtRegressor.fit(X_train,y_train)
print(dtRegressor.score(X_train, y_train))
print(dtRegressor.score(X_test, y_test))
#The decision tree regressor already reaches the required score (~0.83 on test) before cross-validation
0.9948592423407845
0.8250944096509519
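
The large train/test gap (0.99 vs 0.83) confirms the unpruned tree overfits; the tuning further down addresses this. The fitted tree also exposes which features drive its splits; a sketch (indices refer to the polynomial feature columns):

# Sketch: ten most important polynomial features in the unpruned tree
importances = pd.Series(dtRegressor.feature_importances_).sort_values(ascending=False)
print(importances.head(10))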
In [133]:
randomForestRegressor = RandomForestRegressor(max_depth=2, random_state=0)
randomForestRegressor.fit(X_train,y_train)
print(randomForestRegressor.score(X_train, y_train))
print(randomForestRegressor.score(X_test, y_test))
0.6769159140478779
0.6310812179568712
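
max_depth=2 constrains the forest heavily, which explains the modest ~0.63 test score. Letting the trees grow deeper usually helps on this data; a sketch with a less constrained forest (the settings are illustrative, not tuned):

# Sketch: a deeper forest with default depth
rf_deep = RandomForestRegressor(n_estimators=100, random_state=0)
rf_deep.fit(X_train, y_train.values.ravel())   # ravel avoids the column-vector warning
print(rf_deep.score(X_train, y_train), rf_deep.score(X_test, y_test))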
In [134]:
from sklearn.ensemble import AdaBoostRegressor
abRegressor = AdaBoostRegressor(random_state=0, n_estimators=100)
abRegressor.fit(X_train,y_train)
print(abRegressor.score(X_train, y_train))
print(abRegressor.score(X_test, y_test))
0.8615854235930094
0.8157435134345754
In [135]:
from sklearn.ensemble import GradientBoostingRegressor
gradientRegressor = GradientBoostingRegressor(random_state=0)
gradientRegressor.fit(X_train,y_train)
print(gradientRegressor.score(X_train, y_train))
print(gradientRegressor.score(X_test, y_test))
0.9633315530162665
0.9032674789608472
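
Gradient boosting fits trees sequentially, so its test performance can be tracked stage by stage to see when extra trees stop helping; a sketch using staged_predict on the fitted model:

# Sketch: test R^2 after each boosting stage
from sklearn.metrics import r2_score
staged_r2 = [r2_score(y_test, pred) for pred in gradientRegressor.staged_predict(X_test)]
plt.plot(staged_r2)
plt.xlabel('boosting stage')
plt.ylabel('test R^2')
plt.show()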

So far the best results come from the DecisionTreeRegressor (~99% train, ~82% test R²) and the GradientBoostingRegressor (~96% train, ~90% test R²).

We will take the DecisionTreeRegressor and the GradientBoostingRegressor forward for hyperparameter tuning and cross-validation.

We will find the best combination of hyperparameters and then run K-fold cross-validation with those settings before concluding on the result.

DecisionTree Regressor

In [136]:
dtRegressor.get_params().keys()
Out[136]:
dict_keys(['ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])
In [137]:
#We will try RandomizedSearchCV first
samples = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [1, 5, 10, 20]
parameters = [{'max_depth':depths,
              'min_samples_leaf':num_leafs}]
dtRegressorRSCV = RandomizedSearchCV(dtRegressor, parameters, cv=samples)
dtRegressorRSCV.fit(X, y)
print(dtRegressorRSCV.best_params_)
{'min_samples_leaf': 1, 'max_depth': 17}
In [138]:
num_folds = 3
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(dtRegressorRSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[0.79637042 0.83790681 0.85108882]
R2: 82.846% (2.332%)
In [139]:
#We will try to use GridSearchCV now
samples = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [1, 5, 10, 20]
parameters = {    'min_samples_leaf'    : num_leafs,
                  'max_depth'    : depths
             }
dtRegressorGSCV = GridSearchCV(dtRegressor, param_grid = parameters, scoring='r2', cv=samples)
dtRegressorGSCV.fit(X, y)
print(dtRegressorGSCV.best_params_)
{'max_depth': 15, 'min_samples_leaf': 1}
In [140]:
num_folds = 3
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(dtRegressorGSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[0.79637042 0.84750034 0.84729649]
R2: 83.039% (2.405%)

Gradient Boosting Regressor

In [141]:
gradientRegressor.get_params().keys()
Out[141]:
dict_keys(['alpha', 'ccp_alpha', 'criterion', 'init', 'learning_rate', 'loss', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_iter_no_change', 'presort', 'random_state', 'subsample', 'tol', 'validation_fraction', 'verbose', 'warm_start'])
In [142]:
#We will try RandomizedSearchCV
samples = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [ 20, 30 ]
parameters = [{'max_depth':depths,
              'min_samples_leaf':num_leafs}]
gradientRegressorRSCV = RandomizedSearchCV(gradientRegressor, parameters, cv=samples)
gradientRegressorRSCV.fit(X, y)
print(gradientRegressorRSCV.best_params_)
{'min_samples_leaf': 20, 'max_depth': 14}
In [143]:
num_folds = 3
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(gradientRegressorRSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[0.91283625 0.91428678 0.90510598]
R2: 91.074% (0.403%)
In [144]:
#We will try to use GridSearchCV 
samples = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [ 20 , 30]
parameters = {    'max_depth'    : depths,
                  'min_samples_leaf': num_leafs
             }
gradientRegressorGSCV = GridSearchCV(gradientRegressor, param_grid = parameters, scoring='r2', cv=samples)
gradientRegressorGSCV.fit(X, y)
print(gradientRegressorGSCV.best_params_)
{'max_depth': 16, 'min_samples_leaf': 20}
In [145]:
num_folds = 3
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(gradientRegressorGSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[0.91306071 0.91559105 0.90228687]
R2: 91.031% (0.577%)
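
The cross-validated scores above were computed on the full data set. As a final check, the best gradient-boosting parameters can be refitted on the training split and scored once on the untouched test split (note the search above used the original eight features, while this split uses the polynomial expansion); a sketch:

# Sketch: refit the best configuration and score it on the held-out test split
best_gb = GradientBoostingRegressor(random_state=0, **gradientRegressorGSCV.best_params_)
best_gb.fit(X_train, y_train.values.ravel())
print("Held-out test R2:", best_gb.score(X_test, y_test))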

We can choose either the GradientBoostingRegressor or the DecisionTreeRegressor to get an R² score above 80%; the gradient boosting model is the stronger of the two, with a cross-validated R² of about 91% versus about 83%.

#### Fold count was reduced for faster execution. I tried larger fold counts and got comparable cross-validation values.